This is a notebook to read in the CLEAR and ancillary (3DHST) catalogs, select samples, and make some diagnostic plots.

This uses pandas to store the catalogs as DataFrames. You will need the following packages.


In [ ]:
import os
from glob import glob

import pandas as pd
import numpy as np

from astropy.table import Table
from astropy.io import fits

from IPython.display import Image
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

%matplotlib inline

Set directories and test that they exist:


In [ ]:
cleardir = os.environ['HOME']+'/Data/CLEAR' # sets path to be $HOME/Data/CLEAR
# cleardir should include (available from the website): 
#    RELEASE_v1.0.0/
#    RELEASE_v1.0.0/CATALOGS/
# available from:  https://archive.stsci.edu/pub/clear_team/

threeddir = os.environ['HOME']+'/Data/3DHST/photometry'

# should include 
#    goodsn_3dhst.v4.1.cats/
#        Catalog/
#        Eazy/
#        Fast/
#        RF_colors/
#    goodss_3dhst.v4.1.cats/
#        Catalog/
#        Eazy/
#        Fast/
#        RF_colors/
# available from:  https://3dhst.research.yale.edu/Data.php


catdir = os.path.join(cleardir,'CATALOGS') 
# sets path for $HOME/Data/CLEAR/CATALOGS
# should include all files from CATALOGS/ directory at: 
#     https://archive.stsci.edu/pub/clear_team/CATALOGS/
#    

linedir = os.path.join(cleardir,'RELEASE_v1.0.0/CATALOGS')
# should include all files from RELEASE_v1.0.0/CATALOGS/ directory at:
#.    https://archive.stsci.edu/pub/clear_team/RELEASE_v1.0.0/CATALOGS/

combineddir = os.path.join(cleardir,'RELEASE_v1.0.0/COMBINED')
# should include all files from https://archive.stsci.edu/pub/clear_team/RELEASE_v1.0.0/
# download the tar.gz files and untar them - I sent a .sh script that has suggested ways to download everything.

DOWNLOAD DATA from CLEAR archive site.

If you need to do this, look for AAA_wget.sh as an example of how you can auto-download everything from the CLEAR site. It looks like this:

$ more AAA_wget.sh

# this was a useful wget command to get all files

# Run these from a ~/Data/CLEAR/ directory (that's what Casey did): (you might need to "mkdir INCOMING" and the other directories first)

cd INCOMING && wget -r -nH --cut-dirs=4 --user='papovich@tamu.edu' --ask-password ftp://archive.stsci.edu//pub/clear_team/INCOMING/GoodsN_plus && cd ..

cd INCOMING && wget -r -nH --cut-dirs=4 --user='papovich@tamu.edu' --ask-password ftp://archive.stsci.edu//pub/clear_team/INCOMING/GoodsS_plus && cd ..

# FLT -- this will get everything... all 3.3 Gb worth or something

cd FLTS && wget -r -nH --cut-dirs=4 --user='papovich@tamu.edu' --ask-password ftp://archive.stsci.edu//pub/clear_team/FLTS/*.fits && cd ..

# (Might need to "mkdir RELEASE_v1.0.0" first... )

cd RELEASE_v1.0.0 && wget -r -nH --cut-dirs=4 --user='papovich@tamu.edu' --ask-password ftp://archive.stsci.edu//pub/clear_team/RELEASE_v1.0.0/* && cd ..

In [ ]:
### Check that all the directories we just set up do exist:

for d in [cleardir, catdir, linedir, threeddir,combineddir] :
    if os.path.exists(d) : print ("EXISTS: %s" % d)
    else : print("DOES NOT EXIST: %s" %  d)
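
If any of these directories are missing, you can create them before downloading. A minimal sketch (os.makedirs with exist_ok=True is safe to re-run); the download notes above also suggest making INCOMING and the other directories by hand:


In [ ]:
# optionally create any missing directories before downloading:
for d in [cleardir, catdir, linedir, threeddir, combineddir]:
    os.makedirs(d, exist_ok=True)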

We check if all GOODS-North catalogs are in place:


In [ ]:
gndfile = os.path.join(catdir,'goodsn_3dhst.v4.3.cat')
gndzoutfile = os.path.join(catdir,'goodsn_v4.3.zout')
gndlinefile = os.path.join(linedir,'GN_CLEAR.linefit.concat.v1.0.0.fits')
gndzfitfile = os.path.join(linedir,'GN_CLEAR.zfit.concat.v1.0.0.fits')
gndfoutfile = os.path.join(catdir,'goodsn_v4.3.fout')
#gndfoutfile = os.path.join(threeddir,'goodsn_3dhst.v4.1.cats','Fast','goodsn_3dhst.v4.1.fout')

# all need to exist:
for file in [gndfile, gndzoutfile, gndlinefile, gndzfitfile, gndfoutfile] : 
    if os.path.exists(file) : print("EXISTS: %s" % file)
    else : print("DOES NOT EXIST: %s" % file)

We check if all GOODS-South catalogs are in place:


In [ ]:
gsdfile = os.path.join(catdir,'goodss_3dhst.v4.3.cat')
gsdzoutfile = os.path.join(catdir,'goodss_v4.3.zout')
gsdlinefile = os.path.join(linedir,'GS_CLEAR.linefit.concat.v1.0.0.fits')
gsdzfitfile = os.path.join(linedir,'GS_CLEAR.zfit.concat.v1.0.0.fits')
gsdfoutfile = os.path.join(catdir,'goodss_v4.3.fout')
#gsdfoutfile = os.path.join(threeddir,'goodss_3dhst.v4.1.cats/','Fast','goodss_3dhst.v4.1.fout')

for file in [gsdfile, gsdzoutfile, gsdlinefile, gsdzfitfile, gsdfoutfile] : 
    if os.path.exists(file) : print("EXISTS: %s" % file)
    else : print("DOES NOT EXIST: %s" % file)
The COMBINED directory is expected to look like this:

```
COMBINED/
    1D/FITS
    1D/PNG
    2D/FITS
    2D/PNG
    LINEFIT/CHAIN_PNG
    LINEFIT/DAT
    LINEFIT/FITS
    LINEFIT/PNG
    ZFIT/2D_FITS
    ZFIT/2D_PNG
    ZFIT/DAT
    ZFIT/FITS
    ZFIT/PNG
    ZFIT/PZ_FITS
    ZFIT/TILT_DAT
    ZFIT/TILT_PNG
```

The commands in the next In [ ] block should produce the output above:

In [ ]:
# The COMBINED Directory is expected to look like this: 
for dls in os.listdir(combineddir) : 
    if dls != ".DS_Store" : 
        for d in os.listdir(os.path.join(combineddir,dls)) : 
            if d != ".DS_Store" : 
                print(os.path.join(dls,d))

Next, read all the catalogs into pandas DataFrames.

This is set up to read the ascii files; the grism zfit and linefit catalogs are FITS tables, which we read with astropy's Table and convert to pandas (that's probably easier in some ways):


In [ ]:
# define a routine to read in all the catalogs: 
def loadclear(catfile, zoutfile, foutfile, zfitfile, linefile) : 

    # the photometric, z-phot, and FAST catalogs are ascii tables:
    cat = Table.read(catfile, format='ascii').to_pandas()
    zcat = Table.read(zoutfile, format='ascii').to_pandas()
    fcat = Table.read(foutfile, format='ascii').to_pandas()
    
    # the grism zfit and linefit catalogs are FITS tables:
    zfitcat = Table.read(zfitfile).to_pandas()
    linecat = Table.read(linefile).to_pandas()
    
    return(cat, zcat, fcat, zfitcat, linecat)

Open all the catalogs. This may take a few seconds; we are reading all 10 files at once.


In [ ]:
gnd, gndz, gndf, gndzfit, gndline = loadclear(gndfile, gndzoutfile, gndfoutfile, 
                                              gndzfitfile, gndlinefile)
gsd, gsdz, gsdf, gsdzfit, gsdline = loadclear(gsdfile, gsdzoutfile, gsdfoutfile, 
                                              gsdzfitfile, gsdlinefile)

In the above, we now have:

gnd: CLEAR photometric catalog for GOODS-N Deep. This is ID-matched to 3DHST, but includes the HST WFC3 Y-band photometry.

gndz: CLEAR EAZY file (z-phot) based on the broad-band photometry.

gndf: CLEAR FOUT file (from FAST) - has mass information, but something is odd with it...

gndzfit: CLEAR G102 grism redshift fits from Iva, NOT LINEMATCHED.

gndline: CLEAR G102 grism emission-line identifications using the zfits, NOT LINEMATCHED.

The gsd* variables hold the corresponding files for GOODS-S Deep.
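
A quick sanity check on the row counts: the photometric, EAZY, and FAST catalogs are linematched (one row per photometric source), while the grism zfit and linefit catalogs are much shorter, since they only contain objects with grism coverage:


In [ ]:
# sanity check: the photometric, EAZY, and FAST catalogs should all
# have the same length; the grism catalogs are much shorter:
for name, df in [('gnd', gnd), ('gndz', gndz), ('gndf', gndf),
                 ('gndzfit', gndzfit), ('gndline', gndline)]:
    print("%-8s %6d rows" % (name, len(df)))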

Below you can display some of these:


In [ ]:
# limit display to 10 rows:
pd.options.display.max_rows=10

# select only objects with "use==1" and display them to see some of their properties:
# scroll down to see all tables and to the right to see all columns
# Note: they all have the same number of rows!
ok = (gnd['use']==1)
display(gnd[ok],gndz[ok],gndf[ok])

In [ ]:
# show the concatenated z_grism and emission line tables:
# Note: these are much shorter
display(gndzfit,gndline)

Find an object with really high [OIII] flux and display some info and Images:


In [ ]:
#tok = (np.where((gndline['Hb_EQW'] > 10) & (gndline['OIII_EQW'] > 40)))[0]
tok = (gndline['Hb_EQW'] > 10) & (gndline['OIII_EQW'] > 40) & (gndline['Hb_EQW']/gndline['Hb_EQW_ERR'] > 3) & (gndline['OIII_EQW']/gndline['OIII_EQW_ERR'] > 3)
print("There are %d objects which fit these criteria" % (np.sum(tok)))
pd.options.display.max_rows=9999
display(gndline.loc[tok])
pd.options.display.max_rows=10

Examine files for a single object

From above, take the fourth entry (line 39 in the full catalog), PHOT_ID=33115 in GND, print its information, and display the files associated with it.

If you want to see a bad emission line, change these to the object in line 31 (PHOT_ID = 32632).


In [ ]:
print(gndzfit['phot_id'][39])

In [ ]:
SearchID = 33115

ok = np.where(gndzfit['phot_id'] == SearchID)[0]
pd.options.display.max_rows=9999
display(gndzfit.loc[ok], gndline.loc[ok])

Let's find what files are available for this object in the directories.


In [ ]:
# List all the subdirectories available under the 'combineddir'
os.listdir(combineddir)

Search all the subdirectories for files associated with this object and count them:


In [ ]:
### Search all the subdirectories of COMBINED for files associated with this object:
files_all = glob('%s/*/*/*%s*' % (combineddir, SearchID))


print('There are a total of %d files for the combined spectrum of this object.' % (len(files_all)))

### Uncomment the lines below if you want to list the files
#for file in files_all:
#    print(file)

Looks like this object is only found in the GN3 field.

There are several PNG files which are meant to be for diagnostic purposes.


In [ ]:
files_png = glob('%s/*/*/*%s*.png' % (combineddir, SearchID))
print('Found %d PNG files.' % len(files_png))
for file in files_png:
    print(file)

The first file in this list, the GN3-G102_33115_stack.png file, shows all the individual spectra that went into the stack. Let's look at it.


In [ ]:
Image(files_png[0])

The plot shows a row for each spectrum in the final stack, with the bottom-most spectrum showing the final co-add. The left column shows the observed spectra, the middle one the calculated contamination, and the right one the flux minus the contamination. The top two spectra (from pointings GDN18 and GDN22) are from the Barro GO program. The lines clearly show up in all PAs, which indicates that they are real lines.


In [ ]:
### Change 4 to 1,2,3,5 to see the other diagnostic plots
Image(files_png[4])

The plot above shows (1) the grism spectrum in counts/s vs. lambda, (2) the same converted to flux vs. lambda, (3) p(z) vs. z, and (4) the best-fit SED, photometric points, and spectrum.

The Hb and [OIII] lines are clearly visible. The p(z) matches the ground-based spec-z really well.

Now let's look at some of the FITS files and see where the data for these plots came from.


In [ ]:
files_fits = glob('%s/*/*/*%s*.fits' % (combineddir, SearchID))
print('Found %d FITS files.' % len(files_fits))
for file in files_fits:
    file = file.split('/')[-1]
    print(file)

In [ ]:
### This is the content of the 2D file - the file which contains the stack:
print(files_fits[1])
twod_spec = fits.open(files_fits[1])
twod_spec.info()
twod_spec[0].header

The 0th extension is a header with some information about the spectrum. View it with twod_spec[0].header. All the spectra which went into the stack are listed in the TWOD keywords in this header.

Extensions 1, 2, 3, and 4 are postage stamps from the science direct image, interlaced image, weight map, and segmentation map, respectively (hint: they are square, so clearly not spectra; see the dimensions in the 4th column).

Extensions 5, 6, 7, and 8 are 2D spectral arrays containing the spectrum (INCLUDING CONTAMINATION), the error array, the model (of the spectrum itself), and the contamination model (of everything else), respectively.

Extensions 9, 10, and 11 are 1D arrays containing the wavelength solution, the sensitivity, and the trace (in y) at each pixel.


In [ ]:
plt.figure(figsize=(20,10))

plt.subplot(1, 4, 1)
plt.imshow(twod_spec[1].data)

plt.subplot(1, 4, 2)
plt.imshow(twod_spec[2].data)

plt.subplot(1, 4, 3)
plt.imshow(twod_spec[3].data)

plt.subplot(1, 4, 4)
plt.imshow(twod_spec[4].data)

In [ ]:
plt.figure(figsize=(20,10))

# rows: ext 5 = spectrum (incl. contamination); ext 5 minus ext 8 =
# contamination-subtracted spectrum; ext 6 = error; ext 7 = model;
# ext 8 = contamination
plt.subplot(5, 1, 1)
plt.imshow(twod_spec[5].data, vmin=0.0, vmax=0.01)

plt.subplot(5, 1, 2)
plt.imshow(twod_spec[5].data-twod_spec[8].data, vmin=0.0, vmax=0.01)

plt.subplot(5, 1, 3)
plt.imshow(twod_spec[6].data, vmin=0.0, vmax=0.01)

plt.subplot(5, 1, 4)
plt.imshow(twod_spec[7].data, vmin=0.0, vmax=0.01)

plt.subplot(5, 1, 5)
plt.imshow(twod_spec[8].data, vmin=0.0, vmax=0.01)
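
As a further check on the structure described above, the sketch below lists the TWOD keywords from the 0th header (the exact keyword prefix is an assumption) and uses the wavelength extension to make a crude 1D cut through the contamination-subtracted spectrum (the row chosen for the cut is arbitrary, for illustration only):


In [ ]:
# list the individual spectra that went into the stack
# (keywords assumed to start with 'TWOD'; see header description above):
hdr = twod_spec[0].header
for key in hdr:
    if key.startswith('TWOD'):
        print(key, '=', hdr[key])

# crude 1D cut: wavelength (ext 9) vs. one row of the
# contamination-subtracted 2D spectrum (ext 5 minus ext 8):
wave = twod_spec[9].data
clean2d = twod_spec[5].data - twod_spec[8].data
midrow = clean2d.shape[0] // 2   # arbitrary row choice for illustration
plt.plot(wave, clean2d[midrow, :])
plt.xlabel('wavelength [A]')
plt.ylabel('flux - contamination (counts/s)')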

Now, restrict the catalogs to only those objects in CLEAR and merge them.

Here we make one pandas DataFrame by matching on ID numbers, keeping only those objects in the CLEAR ZFIT and LINE catalogs:

We will use the pandas merge routine (pd.merge()). (You can also use pd.concat, but that matches line-by-line.) There are also routines to match by RA and Dec, as sketched below.

When the routine below is done, we have one dataframe with all the information from the other dataframes in it, but only for objects that appear in the CLEAR zfit and linefit catalogs.
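
For reference, a minimal sketch of positional matching with astropy's SkyCoord. The column names 'ra' and 'dec' and the 0.5-arcsec tolerance are assumptions for illustration:


In [ ]:
import astropy.units as u
from astropy.coordinates import SkyCoord

# hypothetical example: match two catalogs by position instead of ID.
# assumes both DataFrames have 'ra' and 'dec' columns in degrees.
def match_radec(df1, df2, tol=0.5*u.arcsec):
    c1 = SkyCoord(ra=df1['ra'].values*u.deg, dec=df1['dec'].values*u.deg)
    c2 = SkyCoord(ra=df2['ra'].values*u.deg, dec=df2['dec'].values*u.deg)
    # nearest neighbor on the sky in df2 for each row of df1:
    idx, d2d, _ = c1.match_to_catalog_sky(c2)
    return idx, (d2d < tol)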


In [ ]:
# could use pd.concat, but this matches line-by-line
# gsddf = pd.concat([gsd,gsdz],axis=1) 
#gsddf = pd.concat([gsddf,gsdf],axis=1)
# default is join='outer', which pads cells with NaNs (that's what you want)  
# join='inner' only keeps rows that are in both catalogs. 
# the axis=1 means to match by ID number
#
# The only caveat is if columns in the different dataframes have the same column name - concat keeps the duplicated names, while merge appends _x/_y suffixes.
#display(gnddf)

# INSTEAD:  here we use PANDAS MERGE routine to match based on keys (on="key") that are the name of the columns.

def domerge(gnd, gndz, gndf, gndzfit, gndlinefit) : 
    # add 'id' to zfit and linefit so we can key off those. 
    # could also use .merge(left, right, left_on='phot_id', right_on='id').... 
    gndzfit['id'] = gndzfit['phot_id']
    gndlinefit['id'] = gndlinefit['phot_id']
    # 
    
    gnddf = gndzfit
    print("Initial number of objects with grism zfits: %i" % len(gnddf))


    for f in [gnd, gndz, gndf, gndlinefit] : 
        gnddf = pd.merge(gnddf,f,on='id',how='inner')

    print("Final number (includes objects with different line identifications: %i" % len(gnddf))
    print()
    return(gnddf)

print("The reason there are more objects in the final than initial is that about ~100 objects fall in regions of overlap between CLEAR fields\n ")
gnddf = domerge(gnd,gndz,gndf, gndzfit, gndline)
gsddf = domerge(gsd,gsdz,gsdf, gsdzfit, gsdline)

Duplicates:

One thing we can do is look for objects with duplicated ID numbers - these are objects that have coverage in 2 CLEAR pointings (e.g., they fall in GN3 and GN5) and so have two entries. We can look at those:


In [ ]:
# Let's look at the Duplicates - 
# this only finds objects that have the same "phot_id" and lists them:
ok = gnddf['id'].duplicated()
#pd.options.display.max_columns = 10
display(gnddf[ok])
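
Note that .duplicated() with default arguments flags only the second and later occurrences of each ID. To see every row of each duplicated object, pass keep=False:


In [ ]:
# show ALL rows of each duplicated ID, not just the repeats:
dup_all = gnddf[gnddf['id'].duplicated(keep=False)]
display(dup_all.sort_values('id'))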

In [ ]:
# Let's look at object 34077
ok=np.where((gnddf['phot_id_x']==34077))[0]
display(gnddf.iloc[ok])
#ok
#display(gndline[ok])

In [ ]:
sname = [gnddf['grism_id_x'][ok[i]].decode('ASCII') for i in range(len(ok))] 
sname

In [ ]:
fig, axes = plt.subplots(2,1, figsize=(50,25))
#for s, i, j in zip(sname,[0,1,0,1], [0,0,1,1]) : 
for s, i in zip(sname[1:3], np.arange(0,2)) : 
    img = mpimg.imread('%s/2D/PNG/%s_stack.png' % (combineddir, s))
    axes[i].imshow(img)

Compare z(grism) with z(phot) (the latter is from broad-band SED fitting).


In [ ]:
pd.options.display.max_rows=2
pd.options.display.max_columns=9999
display(gnddf)
pd.options.display.max_rows=10

In [ ]:
# select "usable" objects: 
okn = ((gnddf.use == 1) & (gnddf.z_peak_grism > 0))
oks = ((gsddf.use == 1) & (gsddf.z_peak_grism > 0))
print(len(okn),len(oks))

In [ ]:
# Plot z(phot) against z(grism)
#mpl.rc('xtick', labelsize=20) 
#mpl.rc('ytick', labelsize=20) 
plt.rcParams.update({'font.size': 22})

fig, axes = plt.subplots(2,1, figsize=(15,15))
axes[0].scatter( gnddf['z_peak_phot'][okn], gnddf['z_peak_grism'][okn])
axes[1].scatter( gsddf['z_peak_phot'][oks], gsddf['z_peak_grism'][oks])
for ax, label in zip(axes,['GND','GSD']) : 
    ax.set_xlabel('z(phot)')
    ax.set_ylabel('z(grism)')
    ax.set_xlim([0,5])
    ax.set_ylim([0,5])
    ax.plot([0,5],[0,5],'-r')
    ax.text(0.5,4.5,label)
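
To quantify the agreement in the plots above, a common statistic is the normalized median absolute deviation, sigma_NMAD = 1.48 x median(|z_grism - z_phot| / (1 + z_phot)). A minimal sketch for the GND sample (column names as used above):


In [ ]:
# scatter between grism and photometric redshifts via sigma_NMAD:
dz = gnddf['z_peak_grism'][okn] - gnddf['z_peak_phot'][okn]
nmad = 1.48 * np.median(np.abs(dz) / (1 + gnddf['z_peak_phot'][okn]))
print("sigma_NMAD(GND) = %.4f" % nmad)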

Now, read in 3DHST FAST catalogs, match to our objects, and plot H-alpha EW vs. stellar mass:


In [ ]:
gndfastfile = os.path.join(threeddir,
                           'goodsn_3dhst.v4.1.cats','Fast',
                           'goodsn_3dhst.v4.1.fout')
gsdfastfile = os.path.join(threeddir,
                           'goodss_3dhst.v4.1.cats','Fast',
                           'goodss_3dhst.v4.1.fout')
def readfast(fastfile) : 
    # read the whitespace-delimited FAST output into a DataFrame:
    fcat = Table.read(fastfile, format='ascii').to_pandas()
    return(fcat)

gndfast = readfast(gndfastfile)
gsdfast = readfast(gsdfastfile)

In [ ]:
# match the CLEAR grism line catalogs with the 3DHST FAST catalogs based on ID number
gndEW = pd.merge(gndline, gndfast, on='id', how='inner')
gsdEW = pd.merge(gsdline, gsdfast, on='id', how='inner')
pd.options.display.max_rows=2
display(gndEW)
pd.options.display.max_rows=10

In [ ]:
# redshift window where H-alpha (6563 A rest-frame) falls within the
# observed wavelength range used here (8500-12500 A): z = lam_obs/6563 - 1
zmin = (8500./6563. - 1)
zmax = (12500./6563. - 1)
okn = ((gndEW.Ha_EQW > 1) & (gndEW.z_max_grism > zmin) & 
                            (gndEW.z_max_grism < zmax))
oks = ((gsdEW.Ha_EQW > 1) & (gsdEW.z_max_grism > zmin) & 
                            (gsdEW.z_max_grism < zmax))
print("# of GND galaxies with Ha and %4.2f < z < %4.2f = %i" %
      (zmin, zmax, len((np.where(okn==True))[0])))
print("# of GSD galaxies with Ha and %4.2f < z < %4.2f = %i" %
      (zmin, zmax, len((np.where(oks==True))[0])))
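
The same arithmetic gives the selection window for any other line. For example, for [OIII] 5007 (a hypothetical extension of the cell above):


In [ ]:
# redshift window for [OIII] 5007 over the same wavelength range:
zmin_oiii = 8500./5007. - 1
zmax_oiii = 12500./5007. - 1
print("[OIII] visible for %4.2f < z < %4.2f" % (zmin_oiii, zmax_oiii))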

In [ ]:
plt.rcParams.update({'font.size': 22})

fig, axes = plt.subplots(2,1, figsize=(15,15))
axes[0].scatter( gndEW['lmass'][okn], gndEW['Ha_EQW'][okn])
axes[1].scatter( gsdEW['lmass'][oks], gsdEW['Ha_EQW'][oks])

#axes[1].scatter( gsddf['z_peak_phot'][oks], gsddf['z_peak_grism'][oks])
for ax, label in zip(axes,['GND','GSD']) : 
    ax.set_xlabel('log Mass')
    ax.set_ylabel(r'EW(H$\alpha$)')
    ax.set_ylim([1,1000])
    ax.set_yscale("log", nonposy='clip')
    ax.set_xlim([8,11.5])
#    #ax.plot([0,5],[0,5],'-r')
    ax.text(11,500,label)
